In [241]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Show Code"></form>''')
Out[241]:

Data Spaces Thesis

Antonio Alliegro, s250104, antonio.alliegro@studenti.polito.it

In [43]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# seaborn paper context
sns.set_context("paper", font_scale=1.4)
sns.set(style="whitegrid")
In [44]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import Contours, Histogram2dContour, Marker, Scatter, Data, Layout, Figure
import plotly.tools as tools

init_notebook_mode()
# Function def: needed for using Plotly in Google Colab
# Call it in each offline plotting cell
def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        ''')) 
In [45]:
def GridSearch_table_plot(grid_clf, param_name,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True):

    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    if negative:
        clf_score = -grid_clf.best_score_
    else:
        clf_score = grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        import pprint
        pprint.pprint(clf.get_params())

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')

    best_row = scores_df.iloc[0, :]
    if negative:
        best_mean = -best_row['mean_test_score']
    else:
        best_mean = best_row['mean_test_score']
    best_stdev = best_row['std_test_score']
    best_param = best_row['param_' + param_name]

    # display the top 'num_results' results
    # =====================================
    display(pd.DataFrame(cv_results) \
            .sort_values(by='rank_test_score').head(num_results))

    # plot the results
    # ================
    scores_df = scores_df.sort_values(by='param_' + param_name)

    if negative:
        means = -scores_df['mean_test_score']
    else:
        means = scores_df['mean_test_score']
    stds = scores_df['std_test_score']
    params = scores_df['param_' + param_name]

    # plot
    if graph:
        plt.figure(figsize=(8, 8))
        plt.errorbar(params, means, yerr=stds)

        plt.axhline(y=best_mean + best_stdev, color='red')
        plt.axhline(y=best_mean - best_stdev, color='red')
        plt.plot(best_param, best_mean, 'or')

        plt.title(param_name + " vs Score\nBest Score {:0.5f}".format(clf_score))
        plt.xlabel(param_name)
        plt.ylabel('Score')
        plt.show()

Importing the Dataset

In [82]:
# dal dataset ho eliminato a mano 7 righe contenenti null values
features = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']
df = pd.read_csv("processed_cleveland.data", names=features)
print(df.info())
print(df.describe())  # description of statistic features
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
age         297 non-null float64
sex         297 non-null float64
cp          297 non-null float64
trestbps    297 non-null float64
chol        297 non-null float64
fbs         297 non-null float64
restecg     297 non-null float64
thalach     297 non-null float64
exang       297 non-null float64
oldpeak     297 non-null float64
slope       297 non-null float64
ca          297 non-null float64
thal        297 non-null float64
target      297 non-null int64
dtypes: float64(13), int64(1)
memory usage: 32.6 KB
None
              age         sex          cp    trestbps        chol         fbs  \
count  297.000000  297.000000  297.000000  297.000000  297.000000  297.000000   
mean    54.542088    0.676768    3.158249  131.693603  247.350168    0.144781   
std      9.049736    0.468500    0.964859   17.762806   51.997583    0.352474   
min     29.000000    0.000000    1.000000   94.000000  126.000000    0.000000   
25%     48.000000    0.000000    3.000000  120.000000  211.000000    0.000000   
50%     56.000000    1.000000    3.000000  130.000000  243.000000    0.000000   
75%     61.000000    1.000000    4.000000  140.000000  276.000000    0.000000   
max     77.000000    1.000000    4.000000  200.000000  564.000000    1.000000   

          restecg     thalach       exang     oldpeak       slope          ca  \
count  297.000000  297.000000  297.000000  297.000000  297.000000  297.000000   
mean     0.996633  149.599327    0.326599    1.055556    1.602694    0.676768   
std      0.994914   22.941562    0.469761    1.166123    0.618187    0.938965   
min      0.000000   71.000000    0.000000    0.000000    1.000000    0.000000   
25%      0.000000  133.000000    0.000000    0.000000    1.000000    0.000000   
50%      1.000000  153.000000    0.000000    0.800000    2.000000    0.000000   
75%      2.000000  166.000000    1.000000    1.600000    2.000000    1.000000   
max      2.000000  202.000000    1.000000    6.200000    3.000000    3.000000   

             thal      target  
count  297.000000  297.000000  
mean     4.730640    0.946128  
std      1.938629    1.234551  
min      3.000000    0.000000  
25%      3.000000    0.000000  
50%      3.000000    0.000000  
75%      7.000000    2.000000  
max      7.000000    4.000000  

The 'target' field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Since we just want to classificate Heart Diseases Affected and Not Affected patients, we just consider:
Values 1, 2, 3, 4 as 1 ==> Heart Disease Affected Patient
Value 0 ==> Not Affected Patient.

In [83]:
# Target column mapping to {0, 1}
target_dict = {0 : 0, 1 : 1, 2 : 1, 3 : 1, 4 : 1}
new_target_col = df.target.map(target_dict)
df.drop(labels='target', axis="columns", inplace=True)
df['target'] = new_target_col

1. Data Visualization

Once imported the dataset, let's check data distribution. The objective is to gain some insight on features importance in predicting a patient affected or will be by heart diseases.
Which are the most important features to predict a future heart disease affected patient?

Boxplot (Method)

An efficient visual tool for displaying data distribution

The box plot is a method for graphically displaying the distribution of data based on the five number summary: minimum, first quartile, median, third quartile, and maximum.
See picture, the central rectangle spans the first quartile to the third quartile (the interquartile range or IQR).
A segment inside the rectangle shows the median while "whiskers" above and below the box show the locations of the minimum and maximum.

Not uncommonly real datasets will display surprisingly high maximums or surprisingly low minimums called outliers.
John Tukey definition for outliers:
Outliers are either 3×IQR or more above the third quartile or 3×IQR or more below the first quartile.
Suspected outliers are are slightly more central versions of outliers: either 1.5×IQR or more above the third quartile or 1.5×IQR or more below the first quartile.
If either type of outlier is present the whisker on the appropriate side is taken to 1.5×IQR from the quartile (the "inner fence") rather than the max or min, and individual outlying data points are displayed as unfilled circles (for suspected outliers) or filled circles (for outliers). (The "outer fence" is 3×IQR from the quartile.)

Target Count Plot

This feature distinguish between an healthy man and one affected by heart diseases. In the original dataset (from UCI) this was a cathegorical attribute assuming value from 0 to 4. Where 0 meant no heart diseases at all and numbers from 1 to 4 meant presence of heart diseases.

In [48]:
flatui_simple = ["#3498db", "#e74c3c"]
ax=sns.countplot(x='target',data=df, palette=sns.color_palette(flatui_simple));
ax.set_xticklabels(labels=['Not Affected','Affected'], fontsize=14)
ax.set_xlabel('')
ax.set_ylabel("Count",fontsize=14)

for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 3), textcoords = 'offset points', fontsize = 15)
In [121]:
affected = df[(df["target"]==1)].count()[0]
not_affected = df[(df["target"]==0)].count()[0]
tot = df.count()[0]

import matplotlib.pyplot as plt
# Pie chart
labels = ['Not Affected', 'Affected']
sizes = [not_affected/tot, affected/tot]
#colors
flatui_simple = ["#3498db", "#e74c3c"]
 
fig1, ax1 = plt.subplots()
ax1.pie(sizes, colors = flatui_simple, labels=labels, autopct='%1.1f%%', startangle=90,
        textprops={'fontsize': 14})
#draw circle
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')  
plt.tight_layout()
plt.show()

Note that the majority of samples in our dataset are about people who never had some kind of heart disease.
We have 160 samples of Not Affected vs 137 of Affected People. By the way, this is not a strong predominance.

Sex Distribution

In [50]:
sex_pal = ["#ff96ef", "#68bfff"]
ax = sns.countplot(x='sex',data=df, palette=sns.color_palette(sex_pal));
ax.set_xticklabels(labels=['Female','Male'], fontsize=14);
ax.set_xlabel("Sex",fontsize=14); ax.set_ylabel("Count",fontsize=14);

for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 4), textcoords = 'offset points', fontsize = 15)

Dataset samples are for the majority relative to male patients. Male are more than twice the Female. Our data distribution is biased.

Affected/Not Affected count w.r.t Sex

In [51]:
ax1 = sns.catplot(x='sex', col='target', kind='count', data=df, palette=sns.color_palette(sex_pal), estimator=lambda x: len(x));
ax1.set(xticks=[0, 1], xticklabels=['Female', 'Male']);
ax1.set_xticklabels(fontsize=14);
In [52]:
# 1 is for male
# 0 is for female
aff_male = df[(df["target"]==1) & (df["sex"]==1)].count()[0]
aff_female = df[(df["target"]==1) & (df["sex"]==0)].count()[0]
num_male = df[df["sex"]==1].count()[0]
num_female = df[df["sex"]==0].count()[0]
print("Total number of Male subjects is: %d" % num_male)
print("Total number of Female subjects is: %d" % num_female)
print("Number of Heart Diseases Affected male is: %d on %d" % (aff_male, num_male))
print("Number of Heart Diseases Affected female is: %d on %d" % (aff_female, num_female))
male_rate = 100*(aff_male/num_male)  # affected rate for male
female_rate = 100*(aff_female/num_female)  # affected rate for female
print("")
print("Percentage of affected Male: %.02f%% " % male_rate)
print("Percentage of affected Female: %.02f%% " % female_rate)
Total number of Male subjects is: 201
Total number of Female subjects is: 96
Number of Heart Diseases Affected male is: 112 on 201
Number of Heart Diseases Affected female is: 25 on 96

Percentage of affected Male: 55.72% 
Percentage of affected Female: 26.04% 

As we expected the most affected by heart diseases are Male subjects. Note that even if our dataset has more than twice male samples w.r.t female ones, more than the half man samples have some kind of heart disease.

Age distribution

In [53]:
# rug plot draws a small vertical tick at each observation
sns.distplot(df['age'], kde=True, rug=True);

Age-Target

Quantitative data that we have used is the age, and we are comparing it to the categorical variable target.

In [54]:
sns.boxplot(x='target',y='age',data=df, palette=sns.color_palette(flatui_simple));

Is interesting to note that the majority of heart diseases affected patient have an AVG age (~60)
Seems that Heart Diseases are more likely to affect people around sixty years old. From the boxplot is also evident the presence of outliers in Target=1 case.
The majority of Target-1 outliers have an age between 25 and 40 years old.

Thalach

Thalach attribute represents the Maximum heart rate achieved during thalium stress test.

In [55]:
sns.distplot(df['thalach'], kde=True, rug=False);

Thalach-Target

In [56]:
# 'thalach' represents Thalium Test maximum heart rate achieved
sns.boxplot(x='target',y='thalach',data=df, palette=sns.color_palette(flatui_simple));

Trestbps

Resting blood pressure. Measured in (mm Hg).

In [57]:
sns.distplot(df['trestbps'], kde=True, rug=False);

Trestbps-Target

In [58]:
sns.boxplot(x=df['target'], y=df['trestbps'], palette=sns.color_palette(flatui_simple));

Trestbps seems not to be an important feature to discriminate between H.D. affected and Not affected patients.

Chol

Serum cholesterol. Measured in (mg/dl).

In [59]:
sns.distplot(df['chol'], kde=True, rug=False);

Chol-Target

In [60]:
sns.boxplot(x=df['target'], y=df['chol'], palette=sns.color_palette(flatui_simple));

CP: Chest Pain Type

From UCI Dataset description, this feature can assume 4 different values listed below:

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic

In [61]:
cp_unique = df.cp.unique()
cp_unique.sort()
# counting
print("'Chest Pain Type' feature unique values are: ", cp_unique)
df.cp.value_counts()
'Chest Pain Type' feature unique values are:  [1. 2. 3. 4.]
Out[61]:
4.0    142
3.0     83
2.0     49
1.0     23
Name: cp, dtype: int64
In [62]:
ax = sns.countplot(x='cp',data=df, palette=sns.light_palette("navy"));
ax.set_xticklabels(labels=['Typical A.','Atypical A.', 'Non-anginal P.', 'Asymptomatic']);
ax.set_xlabel("Chest Pain Type",fontsize=14); ax.set_ylabel("Count",fontsize=14);

CP with respect to Target

In [63]:
k = sns.catplot(x='cp', col='target', kind='count', data=df,  palette=sns.light_palette("navy"));
k.set(xticks=[0, 1, 2, 3], xticklabels=['1-Typical a.', '2-Atypical a.', '3-Non-anginal', '4-Asymptomatic']);

TODO: Considerations on Chest Pain Type here!

Thal

Stands for Thalium stress test result (normal, fixed defect, or reversible defect)

Value 3: normal
Value 6: fixed defect
Value 7: reversible defect

In [64]:
thal_unique = df.thal.unique()
thal_unique.sort()
print("'Thal' feature unique values are: ", thal_unique)
print()
df.thal.value_counts()
'Thal' feature unique values are:  [3. 6. 7.]

Out[64]:
3.0    164
7.0    115
6.0     18
Name: thal, dtype: int64
In [65]:
n = sns.countplot(x='thal',data=df, palette=sns.light_palette("navy"));
n.set_xticklabels(labels=['Normal', 'Fixed Defect', 'Reversible Defect']);

#n.set_xticklabels(ax.get_xticklabels(), rotation=10, ha="right")  # xtickslabels rotation
#n.set_xticklabels(ax.get_xticklabels(), fontsize=10)  # xtickslabels font size

n.set_xlabel("Thal Test Result",fontsize=14); n.set_ylabel("Count",fontsize=14);
In [66]:
m = sns.catplot(x='thal', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
m.set(xticklabels=['normal', 'fixed\ndefect', 'Reversible\ndefect']);
m.set_xticklabels(fontsize=12);

It's evident from the histograms above that the majority of Heart Disease affected samples are characterized by a Thalium Stress Test Result of 'type 7' - Reversible Defect.

FBS

Fasting Blood Sugar (0 if < 120 mg/dl, 1 if > 120 mg/dl)

In [67]:
x = sns.catplot(x='fbs', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
x.set(xticks=[0, 1], xticklabels=['<120 mg/dl', '>=120 mg/dl']);
x.set_xticklabels(fontsize=12);

Restecg

Resting electrocardiographic results. Values:

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

In [68]:
restecg_unique = df.restecg.unique()
restecg_unique.sort()
print("'restecg' feature unique values are: ", restecg_unique)
print()
df.restecg.value_counts()
'restecg' feature unique values are:  [0. 1. 2.]

Out[68]:
0.0    147
2.0    146
1.0      4
Name: restecg, dtype: int64
In [69]:
x = sns.catplot(x='restecg', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
x.set(xticks=[0, 1, 2], xticklabels=['normal', 'ST-t Wave\nabnormality', 'likely left v.\nhypertrophy']);
x.set_xticklabels(fontsize=12);
In [70]:
tot_abn = (df['restecg'] == 1).sum()
tot_abn_affected = ((df['target'] == 1) & (df['restecg'] == 1)).sum()
print("Number of Patient having ST-T wave abnormality: ", tot_abn)
print("Number of Patient having ST-T wave abnormality that are Affected: ", tot_abn_affected)
Number of Patient having ST-T wave abnormality:  4
Number of Patient having ST-T wave abnormality that are Affected:  3

Note that our dataset contains only 4 samples in which Resting Electrocardiographic result (restecg) is of type 1, so having ST-T wave abnormality.
Of these 4 samples, 3 are about heart disease affected patients (75%).

Exang *

Exercise induced angina (1=yes or 0=no)

In [71]:
x = sns.catplot(x='exang', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
x.set(xticks=[0, 1], xticklabels=['Exang=0', 'Exang=1']);
x.set_xticklabels(fontsize=12);

Oldpeak *

ST depression induced by exercise relative to rest

In [72]:
sns.distplot(df['oldpeak'], kde=True, rug=False);

Slope

The slope of the peak exercise ST segment. Heart Rate slope during peak exercise? Values:

Value 1: downsloping
Value 2: flat
Value 3: upsloping

In [73]:
l=sns.catplot(x='slope', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
l.set(xticklabels=['upsloping', 'flat', 'downsloping']);
l.set_xticklabels(fontsize=12);

CA

When fluoroscopy is used during a cardiac catheterization, the physician can see how blood is moving through the blood vessels and where there are blockages.
Number of major vessels (0-3) colored by flourosopy

In [74]:
m=sns.catplot(x='ca', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
m.set_xticklabels(fontsize=12);

Correlation Matrix

In [75]:
plt.figure(figsize=(20,20))
g=sns.heatmap(df.corr(), annot = True, cmap='Blues',linewidths=.1)
plt.show()
In [ ]:
 
In [ ]:
 

Violin plot is a variation of box plot. It shows the distribution of quantitative data across several levels of categorical variables. The violin plot also features a kde of the underlying distribution. Violin plot is more informative than a plain box plot. While a box plot only shows summary statistics such as mean/median and interquartile ranges, the violin plot shows the full distribution of the data.
Thalac is compared against target. ..
see: https://datavizcatalogue.com/methods/violin_plot.html

In [76]:
sns.violinplot(x='target', y='thalach', data=df, palette=sns.color_palette(flatui_simple));
In [77]:
sns.violinplot(x='target', y='trestbps', data=df, palette=sns.color_palette(flatui_simple));
In [124]:
sns.violinplot(x='target', y='age', data=df, palette=sns.color_palette(flatui_simple));

2. Preprocessing

In [78]:
pd.options.display.max_colwidth = 150
features_table = pd.read_csv("features_heart.csv")
display(features_table)
Extracted Feature Type Description
0 age Continuous Age in years
1 sex Binary 1 = male; 0 = female
2 cp Categorical Chest pain type (typical angina, atypical angina, non-anginal, asymptomatic angina)
3 trestbps Continuous Resting blood pressure (in mm Hg)
4 chol Continuous Serum cholesterol (in mg/dl)
5 fbs Binary Fasting blood sugar (0 if < 120 mg/dl, 1 if > 120 mg/dl)
6 restecg Categorical 0:normal, 1: ST-T wave abnormality, 2:left ventricular hypertrophy
7 thalach Continuous Maximum heart rate achieved during thalium stress test
8 exang Binary Exercise induced angina (1 = yes, 0 = no)
9 oldpeak Continuous ST depression induced by exercise relative to rest
10 slope Categorical Slope of peak exercise (upsloping, flat, downsloping)
11 ca Continuous Number of major vessels colored by fluoroscopy (0-3)
12 thal Categorical Thalium stress test result (normal, fixed defect, reversible defect)
13 num Binary Target (1=affected, 0=not affected)

Onehot Encoding

As we have seen our dataset contains some categorical data. In our specific case each cat. data value represents a different category, with no ordinal relationship. Some algorithms can work directly with categorical data others don't.
e.g Decision trees can be learned directly from categorical data with no data transformation required (note that this depends on the specific implementation).
At the same time many machine learning algortihms do not handle categorical features at all. They require all input variables and output variables to be numeric. In this case we have to preprocess manually the categorical features, in order to have them in an appropriate format for the machine learning model (usually: numeric features).

Because of the lack of ordinal relationship between categories, I don't want the model to assume a natural order between categories. For this reason I've chosen to use OneHot Encoding Method.
One hot encoding is a representation of categorical variables as binary vectors.

  1. You have as many columns as you have cardinalities (values) in the categorical variable.
  2. You have a bunch of zeroes and only few 1s! (one 1 per new feature)

OneHot Encoding Example:

Our dataset categorical features are:

  1. cp
  2. restecg
  3. slope
  4. thal
In [84]:
# pandas.get_dummies() is actually OneHot encoding
categorical_features = ['cp', 'restecg', 'slope', 'thal']
for feature in categorical_features:
    num_classes = len(df[feature].unique())
    curr_onehot = pd.get_dummies(df[feature], prefix='en_' + feature)
    df = df.drop(feature, axis=1)
    df = df.join(curr_onehot)
    

Dataset normalization

In [85]:
from sklearn.preprocessing import StandardScaler
X = df[df.columns[:-1]].values  # from encoded dataframe
y = df["target"].values
# Fai vedere come non normalizzando c'è outlier a (300, -10)
X_std = StandardScaler(with_mean=True, with_std=True).fit_transform(X)  # Normalization: Mean=0; Variance=1

Principle Component Analysis

In [86]:
from sklearn.decomposition import PCA
import matplotlib.patches as mpatches

pca, pca1 = PCA(n_components=2), PCA(n_components=2) # all components
X_t = pca.fit_transform(X_std)  # with normalization
X_t1 = pca1.fit_transform(X)  # without normalization, presence of Outliers, this impact on PC1 p.v.e

# Label to color dict (manual)
label_color_dict = {0: 'blue', 1: 'red'}
# Color vector creation
cvec = [label_color_dict[label] for label in y]
# Legend
negative_patch = mpatches.Patch(color='blue', label='Not affected')
positive_patch = mpatches.Patch(color='red', label='Affected')
# 1st and 2nd Components Scatter
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(121)
ax.set_xlabel('1st PC (%.2f%%)' % (pca.explained_variance_ratio_[0] * 100))
ax.set_ylabel('2nd PC (%.2f%%)' % (pca.explained_variance_ratio_[1] * 100))
ax.set_title('PCA, on Normalized Dataset', fontsize=16)
ax.scatter(X_t[:, 0], X_t[:, 1], c=cvec, edgecolor='b', alpha=0.85, s=100)
ax.legend(handles=[negative_patch, positive_patch])
ax1 = fig.add_subplot(122)
ax1.set_xlabel('1st PC (%.2f%%)' % (pca1.explained_variance_ratio_[0] * 100))
ax1.set_ylabel('2nd PC (%.2f%%)' % (pca1.explained_variance_ratio_[1] * 100))
ax1.set_title('PCA, not Normalized Dataset', fontsize=16)
ax1.scatter(X_t1[:, 0], X_t1[:, 1], c=cvec, edgecolor='b', alpha=0.85, s=100)
ax1.legend(handles=[negative_patch, positive_patch])
plt.show()

2. Classification

Introduction to cross validation!

Utility Functions for decision regions plotting:

In [87]:
def make_meshgrid(x, y, h=.02):
    """Create a mesh of points to plot in
    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional
    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.
    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

2.1 K-Nearest Neighbors

In [122]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.30, random_state=0)
print("X_train shape: ", X_train.shape);
print("X_test shape: ", X_test.shape);
X_train shape:  (207, 22)
X_test shape:  (90, 22)
In [111]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

k_values = [1, 2, 3, 10, 20, 30, 40, 50, 70, 100, 189]

#weights_values = ["uniform", "distance"]
#tuned_parameters = {'n_neighbors': k_values, 'weights': weights_values}

tuned_parameters = {'n_neighbors': k_values}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, tuned_parameters , cv=12, refit=True, return_train_score=True)
knn_cv.fit(X_train, y_train)

print('GridSearchCV 12-Fold on Training Set')
print('CV- Best Parameters are: ', knn_cv.best_params_)

# Mean cross-validated score of the best_estimator
accuracy = 100 * float(knn_cv.best_score_)
print('CV - Best Accuracy Score (AVG on all folds scores) is: %.03f%% ' % accuracy)

# best estimator is accessible by "clf.best_estimator_" if refit=TRUE
print()
print("Using best parameters retrieved by 12-fold CV on Test Set:")
y_pred = knn_cv.best_estimator_.predict(X_test)
test_score = 100 * metrics.accuracy_score(y_test, y_pred)
print("knn_cv - With best params from GridSearchCV [n_neighbors=%d]\nAccuracy Score on Test Set is %.2f%%" %
      (knn_cv.best_params_['n_neighbors'], test_score))
GridSearchCV 12-Fold on Training Set
CV- Best Parameters are:  {'n_neighbors': 10}
CV - Best Accuracy Score (AVG on all folds scores) is: 91.787% 

Using best parameters retrieved by 12-fold CV on Test Set:
knn_cv - With best params from GridSearchCV [n_neighbors=10]
Accuracy Score on Test Set is 90.00%
In [125]:
GridSearch_table_plot(knn_cv, "n_neighbors", negative=False);
best parameters: {'n_neighbors': 10}
best score:      0.91787 (+/-0.06367)
{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'uniform'}
mean_fit_time std_fit_time mean_score_time std_score_time param_n_neighbors params split0_test_score split1_test_score split2_test_score split3_test_score ... split4_train_score split5_train_score split6_train_score split7_train_score split8_train_score split9_train_score split10_train_score split11_train_score mean_train_score std_train_score
3 0.000289 0.000029 0.000910 0.000076 10 {'n_neighbors': 10} 1.000000 0.944444 0.777778 0.944444 ... 0.936842 0.931579 0.931579 0.931579 0.915789 0.926316 0.931579 0.926702 0.930177 0.007127
9 0.000207 0.000013 0.001027 0.000103 100 {'n_neighbors': 100} 1.000000 1.000000 0.833333 0.888889 ... 0.889474 0.910526 0.910526 0.915789 0.900000 0.921053 0.915789 0.931937 0.913921 0.011236
6 0.000310 0.000041 0.000998 0.000059 40 {'n_neighbors': 40} 0.944444 1.000000 0.777778 0.833333 ... 0.910526 0.910526 0.910526 0.926316 0.910526 0.915789 0.915789 0.921466 0.917441 0.007178
7 0.000278 0.000005 0.000971 0.000019 50 {'n_neighbors': 50} 0.944444 1.000000 0.833333 0.833333 ... 0.905263 0.900000 0.905263 0.910526 0.905263 0.894737 0.905263 0.905759 0.906465 0.006730
8 0.000285 0.000018 0.000992 0.000018 70 {'n_neighbors': 70} 1.000000 1.000000 0.777778 0.833333 ... 0.910526 0.910526 0.915789 0.921053 0.905263 0.910526 0.915789 0.910995 0.913046 0.006606
5 0.000314 0.000038 0.001015 0.000129 30 {'n_neighbors': 30} 0.944444 1.000000 0.777778 0.833333 ... 0.905263 0.910526 0.915789 0.921053 0.894737 0.910526 0.915789 0.905759 0.910853 0.008121
4 0.000278 0.000009 0.000915 0.000035 20 {'n_neighbors': 20} 1.000000 0.944444 0.777778 0.888889 ... 0.915789 0.915789 0.921053 0.931579 0.905263 0.915789 0.915789 0.910995 0.916119 0.006942
2 0.000305 0.000041 0.000882 0.000051 3 {'n_neighbors': 3} 0.944444 1.000000 0.833333 0.944444 ... 0.947368 0.952632 0.957895 0.947368 0.947368 0.957895 0.963158 0.952880 0.952566 0.006111
1 0.000279 0.000005 0.000860 0.000043 2 {'n_neighbors': 2} 1.000000 0.833333 0.777778 0.944444 ... 0.942105 0.936842 0.931579 0.947368 0.931579 0.936842 0.942105 0.931937 0.936317 0.005049
0 0.000394 0.000057 0.001137 0.000148 1 {'n_neighbors': 1} 1.000000 0.888889 0.777778 0.944444 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000
10 0.000217 0.000022 0.001154 0.000156 189 {'n_neighbors': 189} 0.555556 0.555556 0.555556 0.555556 ... 0.542105 0.542105 0.542105 0.542105 0.542105 0.542105 0.542105 0.539267 0.541061 0.001240

11 rows × 35 columns

Consideration on above plot:
By increasing the number of n_neighbors considered classification perfomance of our model decreases.
At some point, depending on different class samples cardinality in our dataset, our model will always choose for the majority class, which is 'Not Affected' (160 not affected vs 137 affected).
Since Not Affected and Affected classes represent respectively ~54% and ~46% of samples, at that point ( n_neighbors >= 189 if we use 30% test split size ) our KNN model accuracy will be ~0.54%, predicting always not affected.

Number of folds for CrossValidation: 12
mean_test_score is the average of Test Scores on all k folds.
mean_train_score is the average of Train Scores on all k folds.

In [128]:
#['params', 'rank_test_score', 'mean_train_score', 'mean_test_score'])
df_cv = pd.DataFrame(knn_cv.cv_results_)
cv_mean_train_scores = 100*(df_cv['mean_train_score'].values)  # df is actually sorted by "param_n_neighbors"
cv_mean_test_scores = 100*(df_cv['mean_test_score'].values)
df_cv = df_cv[['param_n_neighbors', 'rank_test_score','mean_train_score','mean_test_score']].sort_values(by=['rank_test_score'])
display(df_cv)
param_n_neighbors rank_test_score mean_train_score mean_test_score
3 10 1 0.930177 0.917874
9 100 1 0.913921 0.917874
6 40 3 0.917441 0.913043
7 50 3 0.906465 0.913043
8 70 3 0.913046 0.913043
5 30 6 0.910853 0.908213
4 20 7 0.916119 0.903382
2 3 8 0.952566 0.893720
1 2 9 0.936317 0.869565
0 1 10 1.000000 0.859903
10 189 11 0.541061 0.541063

When doing k-fold cross-validation, we train k models, each one leaving 1/𝑘 portion of the data out. For each of the models, its train error and validation error is computed. The train error will be the error on the data selected to train the model, and the validation error will be the data left out of the training.
For this reason, for each parameters set, we'll have k training errors and k validation/test errors, and computing their averages: mean_train_score and mean_test_score.


**Cannot plot decision regions if we are in a space with higher dimensionality than 2d!**

In [ ]:
import numpy as np
# Plotting decision regions
from matplotlib.colors import ListedColormap
h = .02  # step size in the mesh
# two class classification target[0, 1]
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#0000FF'])
x_min, x_max = X_t[:, 0].min() - 1, X_t[:, 0].max() + 1
y_min, y_max = X_t[:, 1].min() - 1, X_t[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                    np.arange(y_min, y_max, h))
Z = knn_cv.best_estimator_.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(figsize=(9, 6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot the training points
plt.scatter(X_t[:, 0], X_t[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("knn classification (k = %d)" % knn_cv.best_params_['n_neighbors'])
plt.show()

Scores on Test Set for different K values

In [138]:
k_test_scores = []  # scores on Test Set for different K values, with 'uniform' weights
clf_vect = []  # vector containing fitted classifier for each k value
for k in k_values:
    curr_clf = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    curr_clf.fit(X_train, y_train)
    clf_vect.append(curr_clf)
    y_pred = curr_clf.predict(X_test)
    score = 100 * metrics.accuracy_score(y_test, y_pred)
    k_test_scores.append(score)
In [143]:
# Plotly graph
configure_plotly_browser_state()
init_notebook_mode(connected=False)

trace0 = Scatter(
    x = k_values,
    y = k_test_scores,
    mode = 'lines+markers',
    name = '<b>Score on Test Set</b>',
    line = dict(
        color = ('rgb(255,0,0)'), # red
        width = 4)
      )

trace1 = Scatter(
    x = k_values,
    y = cv_mean_train_scores,
    #mode = 'lines+markers',
    mode = 'lines',
    name = 'Mean Train Score',
    line = dict(
        color = ('rgb(49,130,189)'), # light blue
        width = 4)
      )
trace2 = Scatter(
    x = k_values,
    y = cv_mean_test_scores,
    mode = 'lines+markers',
    name = 'Mean Test Score',
    line = dict(
        color = ('rgb(47,79,79)'), # grey
        width = 3)
      )
In [153]:
# subplot style
fig = tools.make_subplots(rows=3, cols=1,subplot_titles=
                          ('<b>Cross-Validation Train Scores</b>',
                           '<b>Cross-Validation Test Scores</b>',
                           '<b>Test Set Scores</b>'), shared_xaxes=True)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace0, 3, 1)
fig['layout'].update(title='KNN Scores', height=900) # in there height=600, width=600,
fig['layout']['xaxis'].update(title='n_neighbors') #shared axis X
iplot(fig, filename='stacked-knn-scores')
This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x1,y2 ]
[ (3,1) x1,y3 ]

In [154]:
# Oppure: same plot
data = [trace0, trace1, trace2]
layout = dict(title = 'KNN Scores',
              xaxis = dict(title = 'K value'),
              yaxis = dict(title = 'Score'),
              )
fig = dict(data=data, layout=layout)
iplot(fig, filename='k_score_plot')

**Cannot plot decision regions if we are in a space with higher dimensionality than 2d!**

Decision Region for each K Value

In [ ]:
import numpy as np
# Plotting decision regions
from matplotlib.colors import ListedColormap
h = .02  # step size in the mesh
# Create color maps
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#0000FF', '#00FF00'])
# two class classification target[0, 1]
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

for (k, clf) in zip(k_values, clf_vect):
    # we create an instance of Neighbours Classifier and fit the data.
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_t[:, 0].min() - 1, X_t[:, 0].max() + 1
    y_min, y_max = X_t[:, 1].min() - 1, X_t[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X_t[:, 0], X_t[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("%d-NN Decision Boundary" % k)

Consideration on KNN Decision Boundaries

K-nearest neighbors works assigning a label to a sample depending on the k nearest neighbors data points labels.
Once found the group of k nearest training samples to our target data point (can be a test sample/point) the label given to the target will be the most spread label among all the k nearest. Exactly like a majority voting!
With K increasing to infinity the margin will be all blue or all red depending on the total majority.
fonte: https://www.analyticsvidhya.com/blog/2018/03/introduction-k-neighbours-algorithm-clustering/

2.2 Support Vector Machine

2.1. Feature Extraction As the dimension (number of features) increase, it will be harder to do the data mining task such as classification. To solve that problem, Feature Extraction namely PCA which extract original features into a new features using mapping function is used. PCA processes include: data centering, calculate covarian matrix, calculate Eigenvector and Eigenvalue, select top Eigenvector, and transformation data.
2.2. Classification
Support Vector Machine (SVM) is well known as a classifier which can model complex data, has good accuracy and less prone to overfitting. SVM works by searching the linear optimal separating hyperplane (decision boundary). The rationale is that decision boundary with large margin is better when handling unseen data compared to decision boundary with small margin. When the data are not linearly separable, SVM transform original data into a higher dimension using a nonlinear mapping to obtain the separating hyperplane.
see ref.: https://iopscience.iop.org/article/10.1088/1742-6596/971/1/012003/pdf

In [158]:
# Considering only first two principal components
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size=0.30, random_state=0)
print(X_train.shape); print(y_train.shape)
(207, 2)
(207,)

Example of different kernel and decision boundaries

In [164]:
from sklearn import svm

C = 1.0  # SVM regularization -hyperparameter
# training different SVM models
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=3, C=C))

models = (clf.fit(X_t, y) for clf in models)
# title for the plots
titles = ('SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3)')

fig, sub = plt.subplots(2, 2, figsize=(14,9))
fig.suptitle('SVM Models Comparison', size=16) # or plt.suptitle('Main title')
plt.subplots_adjust(wspace=0.2, hspace=0.2)
X0, X1 = X_t[:, 0], X_t[:, 1]
xx, yy = make_meshgrid(X0, X1)
for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xticks(()) 
    ax.set_yticks(())
    ax.set_title(title)
plt.show()

SVM parameters Tuning - CrossValidation

In [227]:
# Tuning Parameters
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [10, 1, 1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [10, 1, 1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['linear'], 'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}
                   ]
In [228]:
# GridSearchCV
# performing 12-fold validation
svm_cv = GridSearchCV(svm.SVC(), tuned_parameters, cv=12, refit=True, return_train_score=True)
svm_cv.fit(X_train, y_train)
print('GridSearchCV 12-Fold on Training Set.\nBest_params= %s with best_score_= %.03f'%
      (svm_cv.best_params_, 100*float(svm_cv.best_score_)))
GridSearchCV 12-Fold on Training Set.
Best_params= {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'} with best_score_= 89.855
In [216]:
#['params', 'rank_test_score', 'mean_train_score', 'mean_test_score'])
df_svm_cv = pd.DataFrame(svm_cv.cv_results_)
# selecting only the interesting columns
df_svm_cv = df_svm_cv[['params', 'rank_test_score','mean_train_score','mean_test_score']].sort_values(by=['rank_test_score'])
# cv_svm_train_scores = 100*(df_svm_cv['mean_train_score'].values)
# cv_sv_test_scores = 100*(df_svm_cv['mean_test_score'].values)
df_svm_cv.head()
Out[216]:
params rank_test_score mean_train_score mean_test_score
63 {'C': 1000, 'gamma': 1e-05, 'kernel': 'sigmoid'} 1 0.884061 0.898551
12 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'} 1 0.888898 0.898551
22 {'C': 50, 'gamma': 0.0001, 'kernel': 'rbf'} 1 0.884061 0.898551
58 {'C': 100, 'gamma': 0.0001, 'kernel': 'sigmoid'} 1 0.884061 0.898551
45 {'C': 10, 'gamma': 0.001, 'kernel': 'sigmoid'} 1 0.884061 0.898551
In [203]:
# best estimator is accessible by "clf2.best_estimator_" if refit=TRUE
y_pred = svm_cv.best_estimator_.predict(X_test)
score = 100 * metrics.accuracy_score(y_test, y_pred)
print("SVM CV - best params from GridSearchCV are: [Kernel: %s, C: %.3f , gamma: %.3f]\nAccuracy Score on Test Set is: %.2f%%" %
      (svm_cv.best_params_['kernel'], svm_cv.best_params_['C'], svm_cv.best_params_['gamma'], score))
SVM CV - best params from GridSearchCV are: [Kernel: rbf, C: 10.000 , gamma: 0.010]
Accuracy Score on Test Set is: 83.33%

Model trained on "best" CrossValidated parameters

In [207]:
# Plotting decision boundaries:
X0, X1 = X_train[:, 0], X_train[:, 1]
xx, yy = make_meshgrid(X0, X1)
fig2 = plt.figure(2, figsize=(10, 7))
ax3 = fig2.add_subplot(111)
plot_contours(ax3, svm_cv.best_estimator_, xx, yy,  # using Best estimator
              cmap=plt.cm.coolwarm, alpha=0.8)
ax3.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax3.set_xlim(xx.min(), xx.max())
ax3.set_ylim(yy.min(), yy.max())
# scores x label
ax3.set_xlabel('Sepal length\nMean Test Score (12-fold): %.03f ; Score on Test Set: %.03f' % (svm_cv.best_score_, score), fontsize=13)
ax3.set_ylabel('Sepal width', fontsize=13)
ax3.set_xticks(())
ax3.set_yticks(())
ax3.set_title(
    'GridSearchCV %d-Fold Validation\n Kernel: %s, C = %.02f, gamma = %.02f' % (12, svm_cv.best_params_['kernel'] ,svm_cv.best_params_['C'], svm_cv.best_params_['gamma']), fontsize=15);

Considerations

SVM: Boundaries Analysis

In [229]:
df_svm_cv # cv parameters combination
Out[229]:
params rank_test_score mean_train_score mean_test_score
63 {'C': 1000, 'gamma': 1e-05, 'kernel': 'sigmoid'} 1 0.884061 0.898551
12 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'} 1 0.888898 0.898551
22 {'C': 50, 'gamma': 0.0001, 'kernel': 'rbf'} 1 0.884061 0.898551
58 {'C': 100, 'gamma': 0.0001, 'kernel': 'sigmoid'} 1 0.884061 0.898551
45 {'C': 10, 'gamma': 0.001, 'kernel': 'sigmoid'} 1 0.884061 0.898551
53 {'C': 50, 'gamma': 0.001, 'kernel': 'sigmoid'} 6 0.889332 0.893720
66 {'C': 0.1, 'kernel': 'linear'} 6 0.886700 0.893720
65 {'C': 0.1, 'kernel': 'linear'} 6 0.886700 0.893720
49 {'C': 25, 'gamma': 0.001, 'kernel': 'sigmoid'} 6 0.886257 0.893720
30 {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'} 6 0.885820 0.893720
17 {'C': 25, 'gamma': 0.001, 'kernel': 'rbf'} 6 0.889770 0.893720
62 {'C': 1000, 'gamma': 0.0001, 'kernel': 'sigmoid'} 6 0.886700 0.893720
44 {'C': 10, 'gamma': 0.01, 'kernel': 'sigmoid'} 6 0.886700 0.893720
26 {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'} 6 0.885380 0.893720
21 {'C': 50, 'gamma': 0.001, 'kernel': 'rbf'} 6 0.886700 0.893720
57 {'C': 100, 'gamma': 0.001, 'kernel': 'sigmoid'} 6 0.886700 0.893720
31 {'C': 1000, 'gamma': 1e-05, 'kernel': 'rbf'} 6 0.885380 0.893720
25 {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'} 6 0.886702 0.893720
13 {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'} 6 0.885818 0.893720
48 {'C': 25, 'gamma': 0.01, 'kernel': 'sigmoid'} 20 0.884507 0.888889
16 {'C': 25, 'gamma': 0.01, 'kernel': 'rbf'} 20 0.891977 0.888889
29 {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'} 20 0.885818 0.888889
56 {'C': 100, 'gamma': 0.01, 'kernel': 'sigmoid'} 23 0.887139 0.884058
61 {'C': 1000, 'gamma': 0.001, 'kernel': 'sigmoid'} 23 0.885820 0.884058
54 {'C': 50, 'gamma': 0.0001, 'kernel': 'sigmoid'} 23 0.879235 0.884058
52 {'C': 50, 'gamma': 0.01, 'kernel': 'sigmoid'} 23 0.886257 0.884058
18 {'C': 25, 'gamma': 0.0001, 'kernel': 'rbf'} 23 0.878796 0.884058
70 {'C': 100, 'kernel': 'linear'} 28 0.884061 0.874396
71 {'C': 1000, 'kernel': 'linear'} 28 0.884500 0.874396
27 {'C': 100, 'gamma': 1e-05, 'kernel': 'rbf'} 28 0.875287 0.874396
... ... ... ... ...
40 {'C': 0.1, 'gamma': 0.01, 'kernel': 'sigmoid'} 42 0.855077 0.855072
59 {'C': 100, 'gamma': 1e-05, 'kernel': 'sigmoid'} 42 0.855074 0.855072
23 {'C': 50, 'gamma': 1e-05, 'kernel': 'rbf'} 42 0.855074 0.855072
46 {'C': 10, 'gamma': 0.0001, 'kernel': 'sigmoid'} 42 0.855074 0.855072
64 {'C': 0.001, 'kernel': 'linear'} 42 0.855074 0.855072
55 {'C': 50, 'gamma': 1e-05, 'kernel': 'sigmoid'} 48 0.541061 0.541063
51 {'C': 25, 'gamma': 1e-05, 'kernel': 'sigmoid'} 48 0.541061 0.541063
0 {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'} 48 0.541061 0.541063
43 {'C': 0.1, 'gamma': 1e-05, 'kernel': 'sigmoid'} 48 0.541061 0.541063
1 {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'} 48 0.541061 0.541063
2 {'C': 0.001, 'gamma': 0.0001, 'kernel': 'rbf'} 48 0.541061 0.541063
3 {'C': 0.001, 'gamma': 1e-05, 'kernel': 'rbf'} 48 0.541061 0.541063
5 {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'} 48 0.541061 0.541063
6 {'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'} 48 0.541061 0.541063
7 {'C': 0.1, 'gamma': 1e-05, 'kernel': 'rbf'} 48 0.541061 0.541063
9 {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'} 48 0.541061 0.541063
10 {'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'} 48 0.541061 0.541063
11 {'C': 0.1, 'gamma': 1e-05, 'kernel': 'rbf'} 48 0.541061 0.541063
15 {'C': 10, 'gamma': 1e-05, 'kernel': 'rbf'} 48 0.541061 0.541063
19 {'C': 25, 'gamma': 1e-05, 'kernel': 'rbf'} 48 0.541061 0.541063
32 {'C': 0.001, 'gamma': 0.01, 'kernel': 'sigmoid'} 48 0.541061 0.541063
33 {'C': 0.001, 'gamma': 0.001, 'kernel': 'sigmoid'} 48 0.541061 0.541063
34 {'C': 0.001, 'gamma': 0.0001, 'kernel': 'sigmoid'} 48 0.541061 0.541063
37 {'C': 0.1, 'gamma': 0.001, 'kernel': 'sigmoid'} 48 0.541061 0.541063
38 {'C': 0.1, 'gamma': 0.0001, 'kernel': 'sigmoid'} 48 0.541061 0.541063
39 {'C': 0.1, 'gamma': 1e-05, 'kernel': 'sigmoid'} 48 0.541061 0.541063
41 {'C': 0.1, 'gamma': 0.001, 'kernel': 'sigmoid'} 48 0.541061 0.541063
42 {'C': 0.1, 'gamma': 0.0001, 'kernel': 'sigmoid'} 48 0.541061 0.541063
47 {'C': 10, 'gamma': 1e-05, 'kernel': 'sigmoid'} 48 0.541061 0.541063
35 {'C': 0.001, 'gamma': 1e-05, 'kernel': 'sigmoid'} 48 0.541061 0.541063

72 rows × 4 columns

Estimator ranked one on Mean Test Score:
63 {'C': 1000, 'gamma': 1e-05, 'kernel': 'sigmoid'} 1 0.884061 0.898551 12 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'} 1 0.888898 0.898551 22 {'C': 50, 'gamma': 0.0001, 'kernel': 'rbf'} 1 0.884061 0.898551 58 {'C': 100, 'gamma': 0.0001, 'kernel': 'sigmoid'} 1 0.884061 0.898551 45 {'C': 10, 'gamma': 0.001, 'kernel': 'sigmoid'} 1 0.884061 0.898551 66 {'C': 0.1, 'kernel': 'linear'} 6 0.886700 0.893720

In [239]:
m = []
t = []

m.append(svm.SVC(kernel='rbf', C=10, gamma = 10))
t.append("C: 10, gamma: 10, kernel: rbf")

m.append(svm.SVC(kernel='rbf', C=50, gamma = 0.1))
t.append("C: 50, gamma: 0.0001, kernel: rbf")

m.append(svm.SVC(kernel='sigmoid', C=1000, gamma = 1e-05))
t.append("C: 1000, gamma: 1e-05, kernel: sigmoid")

m.append(svm.SVC(kernel='sigmoid', C=100, gamma = 0.0001))
t.append("C: 100, gamma: 0.0001, kernel: sigmoid")

m.append(svm.SVC(kernel='sigmoid', C=10, gamma = 0.1))
t.append("C: 10, gamma: 0.1, kernel: sigmoid")

m.append(svm.SVC(kernel='sigmoid', C=10, gamma = 1))
t.append("C: 10, gamma: 0.1, kernel: sigmoid")

m.append(svm.SVC(kernel='linear', C=0.1))
t.append("C: 0.1, kernel: linear")

m.append(svm.SVC(kernel='linear', C=1))
t.append("C: 1, kernel: linear")

m.append(svm.SVC(kernel='linear', C=10))
t.append("C: 10, kernel: linear")
In [240]:
for m_value, t_value in zip(m,t):
    curr_clf = m_value
    curr_clf.fit(X_train, y_train)
    curr_pred = curr_clf.predict(X_test)
    score = 100 * metrics.accuracy_score(y_test, curr_pred)
    X0, X1 = X_train[:, 0], X_train[:, 1]
    xx, yy = make_meshgrid(X0, X1)
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)
    plot_contours(ax, curr_clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # scores x label
    ax.set_xlabel(
        'Sepal length\nScore on Test Set: %.03f' % (score),
         fontsize=13)
    ax.set_ylabel('Sepal width', fontsize=13)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(
        t_value , fontsize=16)
    plt.show()
    i += 1
In [ ]:
 
In [ ]:
i = 1
for C_value in C:
    for gamma_value in gamma:
        for kernel_value in kernel:
            curr_clf = svm.SVC(kernel='rbf', gamma=gamma_value, C=C_value)
            curr_clf.fit(X_train, y_train)
            curr_pred = curr_clf.predict(X_test)
            score = 100 * metrics.accuracy_score(y_test, curr_pred)
            X0, X1 = X_train[:, 0], X_train[:, 1]
            xx, yy = make_meshgrid(X0, X1)
            fig = plt.figure(figsize=(10, 7))
            ax = fig.add_subplot(111)
            plot_contours(ax, curr_clf, xx, yy,
                          cmap=plt.cm.coolwarm, alpha=0.8)
            ax.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            # scores x label
            ax.set_xlabel(
                'Sepal length\nScore on Test Set: %.03f' % (score),
                fontsize=13)
            ax.set_ylabel('Sepal width', fontsize=13)
            ax.set_xticks(())
            ax.set_yticks(())
            ax.set_title(
                'Plot %d: SVM - RBF with C=%.02f, gamma=%.03f' % (i, C_value, gamma_value), fontsize=16)
            plt.show()
            i += 1

Considerations on decision boundaries! TO DO!

In [ ]: